Introduction

This is a tutorial script to producing an interactive visualization of words in a publication list extracted from Google Scholar. The appearance of the words is dependent on the frequency of their occurrence, and on which words they co-occur with in the titles of the listed articles.

The code is for R and it is, among others, based on packages ggplot2, plotly and htmlWidgets.

This document is part of the Github repository ScholarWordGraph by Tommi Suvitaival.

View this document at https://tommi-s.com/ScholarWordGraph/ to show all output correctly.

Preparations

  1. Open a Google Scholar profile with browser (e.g., https://scholar.google.com/citations?user=SGgP0VQAAAAJ ).
  2. Go to the bottom of the page and click “SHOW MORE” to show all entries on the page.
  3. Go again to the bottom of the newly-expanded page.
  4. Select everything in the table, starting from the year of the last entry (on the bottom-right corner) and ending at the title of the first entry (on the top-left corner).
  5. Copy the selected text.
  6. Paste the text to a text editor and save as “data/publications.txt” (see example in the present repository).

Load the Data

data.loaded <- 
  readr::read_delim(
    file = "data/publications.txt",
    delim = "\t",
    escape_double = FALSE,
    col_names = FALSE,
    trim_ws = TRUE
  )
## Rows: 96 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (1): X1
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Format the Data

Create a Data Frame of Articles

  • Create a matrix with articles as rows and title, authors and journal as the three columns.
  • Convert to a data frame.
data <- unlist( data.loaded )
## Warning: One or more parsing issues, see `problems()` for details
data <-
  matrix(
    data = unlist( data ),
    ncol = 3,
    byrow = TRUE
  )

colnames( data ) <-
  c(
    "Title",
    "Authors",
    "Journal"
  )

data <-
  data.frame(
    data,
    stringsAsFactors = FALSE
  )

Convert Titles to Lower Case

  • To match identical words with different case, convert all to lowercase.
data$"Title.lower" <- tolower( data$"Title" )

Define Multi-Word Terms

  • Define word sequences that should be considered as a single term by adding replacing whitespace with underscore.
  • E.g., “type 1 diabetes” is one term.
data$"Title.lower" <-
  stringr::str_replace_all(
    string = data$"Title.lower",
    pattern = "type 1 diabetes",
    replacement = "type_1_diabetes"
  )

data$"Title.lower" <-
  stringr::str_replace_all(
    string = data$"Title.lower",
    pattern = "type 2 diabetes",
    replacement = "type_2_diabetes"
  )

Extract Words from the Titles

words <-
  stringr::str_split(
    string = data[ , "Title.lower" ],
    pattern = "\\s"
  )

Omit Special Characters

  • Omit punctuation and special characters from the titles.
words <- 
  lapply(
    X = words,
    FUN = tolower
  )

words <-
  lapply(
    X = words,
    FUN = stringr::str_replace_all,
    pattern = "(\\()|(\\))|(\\:)|(\\,)|(\\.)",
    replacement = ""
  )

table.words <- 
  sort(
    x = table( unlist( words ) ),
    decreasing = TRUE
  )

head( table.words )
## 
##   in   of  and  the with  for 
##   25   21   17   13   11    8

Extract Unique Words

  • List all unique words that appear in the titles.
words.unique <-
  sort(
    unique(
      unlist( words ) )
  )

Define List of Blocked Words

  • List general words that are non-informative.
  • Add plain numbers to this list.
blocklist <-
  c(
    "and",
    "an",
    "after",
    "are",
    "as",
    "at",
    "based",
    "by",
    "do",
    "during",
    "for",
    "from",
    "in",
    "is",
    "of",
    "on",
    "not",
    "the",
    "to",
    "through",
    "with",
    "without",
    "...",
    letters
  )

blocklist <- 
  c(
    blocklist,
    words.unique[ grepl( x = words.unique, pattern = "^[0-9]+$" ) ]
  )

Omit Blocked Words

  • Omit the blocked words from the words list of each article.
words <-
  lapply(
    X = words,
    FUN = function( x ) {
      x[ !( x %in% blocklist ) ]
    }
  )

table.words <- 
  sort(
    x = table( unlist( words ) ),
    decreasing = TRUE
  )

head( table.words )
## 
## type_1_diabetes        analysis      associated          plasma            data 
##               6               5               5               5               4 
##      lipidomics 
##               4

Re-Extract Unique Words

words.unique <-
  sort(
    unique(
      unlist( words ) )
  )

Create the Words-by-Articles Occurrence Matrix

wba <-
  array(
    data = 0,
    dim = c( length( words.unique ), nrow( data ) )
  )

rownames( wba ) <- words.unique
colnames( wba ) <- data[ , "Title" ]

for ( i in 1:nrow( data ) ) {
  
  wba[ words[[ i ]], i ] <- 1
  
}

Log-Transform the Occurrences

  • Trasform the occurrences with log (1+x) for a more Gaussian-like distribution.
wba.log1px <- log10( 1 + wba )

Autoscale the Occurrence Profiles

  • Scale the data for PCA.
wba.norm <- scale( wba.log1px )

Principal Component Analysis

Compute PCA

result.pca <-
  princomp(
    x = wba.norm
  )

Plot PCA

library( "ggfortify" )
## Warning: package 'ggfortify' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
plot <-
  autoplot( 
    result.pca
    # ,
    # loadings = TRUE,
    # loadings.label = TRUE
  )

print( plot )

Cluster the Words

result.clustering <-
  cluster::pam(
    x = result.pca$scores[ , 1:2 ], # based on the scores on the first two principal components
    # x = wba.norm, # alternatively, based on the original high-dimensional data
    k = 9
  )

Figure

Prepare Results for the Visualization

  • Extract PCA-scores of the two first components.
  • Extract cluster assignments for each data point.
  • Compute each data point’s Euclidean distance from the origin.
  • Wrap multi-word terms to multiple lines for better visual impression.
  • Create a text string listing all the article titles, where the term appears.
data.plot <-
  data.frame(
    Word = rownames( result.pca$"scores" ),
    result.pca$"scores"
  )

data.plot$"Cluster" <- 
  factor(
    x = result.clustering$"clustering",
    levels = 1:length( unique( result.clustering$"clustering" ) ),
    labels = 
      c(
        "Cluster (C) 1",
        paste( "C", 2:length( unique( result.clustering$"clustering" ) ) )
      )
  )
  
data.plot$"Distance" <- 
  sqrt( data.plot$"Comp.1"^2 + data.plot$"Comp.2"^2 ) + 1

data.plot$"Term" <-
  stringr::str_replace_all(
    string = data.plot$"Word",
    pattern = "\\-",
    replacement = "-\n"
  )

data.plot$"Term" <-
  stringr::str_replace_all(
    string = data.plot$"Word",
    pattern = "\\_",
    replacement = "\n"
  )

data.plot$"Publications" <-
  apply(
    X = wba[ data.plot$"Word", ] == 1,
    MAR = 1,
    FUN = function( x ) {
      paste(
        "\n",
        names(
          which( x )
        ),
        collapse = "\n"
      )
    }
  )

data.plot <- 
  data.plot[ 
    order( data.plot$"Distance", decreasing = FALSE ),
  ]

Define the Figure

  • Visualization with ggplot2.
  • PCA components 1 and 2, respectively, on the y- and x-axes.
  • Term name as text label.
  • Distance from origin as (increasing) size.
  • Cluster assignment as color.
  • Add jitter to reduce overlap of similar terms.
  • Use pseudo-log-valued axes for reduced outliers.
jitter <- 1

plot <-
  ggplot2::ggplot(
    data = data.plot,
    mapping =
      ggplot2::aes(
        x = Comp.2,
        y = Comp.1,
        label = Term,
        size = Distance,
        text = Publications
        ,
        color = Cluster,
      )
  ) +
  ggplot2::geom_text(
    position =
      ggplot2::position_jitter(
        height = jitter,
        width = jitter
      )
  ) +
  ggplot2::scale_x_continuous( trans = "pseudo_log" ) +
  ggplot2::scale_y_continuous( trans = "pseudo_log" ) +
  ggplot2::theme(
    axis.text = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank()
  ) +
  ggplot2::xlab( label = "\n\n\nMap of my research topics" ) +
  ggplot2::ylab( label = "" ) +
  ggplot2::scale_color_brewer( palette = "Set1" )

Show the (Passive) Figure

plot

Create the Interactive Figure

  • Make ggplot2 figure interactive with the ggplotly-function from the plotly-package.
  • Show the term and the articles, where it appears, as text (tooltip) when hovering over a data point.
  • Use the layout-function for fine-tuning of the legend.
library( tidyr )

plot.interactive <-
  plotly::ggplotly( 
    p = plot,
    tooltip = c( "label", "Publications" )
  ) %>%
  plotly::layout(
    legend = 
      list(
        orientation = "h",
        title = list( text = "Cluster<br>" ),
        x = quantile( x = data.plot$"Comp.2", probs = 0.67 ),
        y = min( data.plot$"Comp.1" )
      )
  )

View the Interactive Figure

  • Words are shown on the map in increasing size with regard their PCA-score.
  • Hover over a word to reveal, which article titles it occurs in.
  • Clusters shown with distinct colors.
  • Double-click a cluster (Cs) in the legend to show it alone (and double-click again to return).
  • Single-click a cluster to hide it (and click again to return).
  • Select area to zoom in the map (and double click the map area to return).
plot.interactive

Export HTML Widget

  • Create a stand-alone html-widget of the figure with the partial_bundle function from the plotly-package.
  • Save the widget with the saveWidget-function from the htmlWidgets-package.

(Uncomment for saving the files.)

widget <- plotly::partial_bundle( plot.interactive )

# dir.create( "output" )
# 
# htmlwidgets::saveWidget( widget, "output/index.html" )

View the Exported Widget


<iframe 
  id="igraph" 
  scrolling="no" 
  style="border:none;" 
  seamless="seamless" 
  src="output/index.html" 
  height="800" 
  width="800"
>
widget
</iframe>

Figure with a Dark Layout

Define the Figure

  • Create the same figure with a dark layout, suitable for dark webpages.
plot.dark <-
  plot +
  ggplot2::theme(
    axis.text = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank(),
    axis.title = ggplot2::element_text( color = "gray" ),
    legend.background = ggplot2::element_rect( color = NA, fill = "black" ),
    legend.key = ggplot2::element_rect( color = "gray",  fill = "gray" ),      
    legend.text = ggplot2::element_text( color = "gray" ),
    legend.title = ggplot2::element_text( color = "gray" ),
    panel.background = ggplot2::element_rect( fill = "black", color  = NA ),
    panel.grid = ggplot2::element_blank(),
    plot.background = ggplot2::element_rect( color = "black", fill = "black" )
  )

Show the (Passive) Figure

plot.dark

Create the Interactive Figure

library( tidyr )

plot.dark.interactive <-
  plotly::ggplotly( 
    p = plot.dark,
    tooltip = c( "label", "Publications" )
  ) %>%
  plotly::layout(
    legend = 
      list(
        orientation = "h",
        title = list( text = "Cluster<br>" ),
        x = quantile( x = data.plot$"Comp.2", probs = 0.67 ),
        y = min( data.plot$"Comp.1" )
      )
  )

View the Interactive Figure

plot.dark.interactive

Export HTLM Widget

(Uncomment for saving the files.)

widget <- plotly::partial_bundle( plot.dark.interactive )

# dir.create( "output-dark" )
# 
# htmlwidgets::saveWidget( widget, "output-dark/index.html" )

View the Exported Widget


<iframe 
  id="igraph" 
  scrolling="no" 
  style="border:none;" 
  seamless="seamless" 
  src="output-dark/index.html" 
  height="800" 
  width="800"
>
widget
</iframe>

SessionInfo

utils::sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tidyr_1.1.2      ggfortify_0.4.12 ggplot2_3.3.5   
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1   xfun_0.27          purrr_0.3.4        colorspace_2.0-0  
##  [5] vctrs_0.3.8        generics_0.1.0     htmltools_0.5.1.1  viridisLite_0.4.0 
##  [9] yaml_2.2.1         utf8_1.1.4         plotly_4.9.4.1     rlang_0.4.11      
## [13] jquerylib_0.1.4    pillar_1.6.2       glue_1.4.2         withr_2.4.2       
## [17] bit64_4.0.5        RColorBrewer_1.1-2 lifecycle_1.0.0    stringr_1.4.0     
## [21] munsell_0.5.0      gtable_0.3.0       htmlwidgets_1.5.4  evaluate_0.14     
## [25] labeling_0.4.2     knitr_1.34         tzdb_0.1.2         crosstalk_1.1.1   
## [29] curl_4.3           parallel_4.0.4     fansi_0.4.2        highr_0.9         
## [33] readr_2.0.1        scales_1.1.1       vroom_1.5.5        jsonlite_1.7.2    
## [37] farver_2.1.0       bit_4.0.4          gridExtra_2.3      hms_1.1.0         
## [41] digest_0.6.27      stringi_1.5.3      dplyr_1.0.4        grid_4.0.4        
## [45] cli_3.0.1          tools_4.0.4        magrittr_2.0.1     lazyeval_0.2.2    
## [49] tibble_3.0.6       cluster_2.1.2      crayon_1.4.1       pkgconfig_2.0.3   
## [53] ellipsis_0.3.2     data.table_1.13.6  rmarkdown_2.11     httr_1.4.2        
## [57] rstudioapi_0.13    R6_2.5.1           compiler_4.0.4

Appendix

if ( file.exists( "README.html" ) ) {
  
  system( command = "rm index.html" )
  system( command = "cp README.html index.html" )
  
}
## [1] 0